Contents

library(tidyverse)
library(magrittr)
library(pheatmap)

1 Load data

1.1 Normalized counts

counts <- read.delim(paste0(dir, "Data/count_data.csv"), sep = ",")
mean_counts <- read.delim(paste0(dir, "Data/mean_counts.csv"), sep = ",") %>%
  group_by(gene, age_group) %>%
  summarize(counts = mean(counts))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
head(mean_counts)
## # A tibble: 6 × 3
## # Groups:   gene [2]
##   gene  age_group counts
##   <chr> <chr>      <dbl>
## 1 A1BG  1-15       143. 
## 2 A1BG  16-26      130. 
## 3 A1BG  27-60      111. 
## 4 A1BG  61-85      134. 
## 5 A1BG  86-96       45.5
## 6 A2M   1-15       465.

1.2 TF-target interactions

TF_targets <- read.delim(paste0(dir, "Data/tf-target-information.txt")) %>%
  subset(select = c("TF", "target")) %>%
  distinct() %>%
  drop_na()
head(TF_targets)
##      TF      target
## 1 AEBP2      TMEM53
## 2 AEBP2    C1orf228
## 3 AEBP2      FBXO31
## 4 AEBP2    ADAMTSL5
## 5 AEBP2 CTB-25B13.9
## 6 AEBP2     MIR6791

1.3 Differentially expressed genes

p_0.05 <- read.delim(paste0(dir, "Data/DE_var_p_n_200.csv"), sep = ",")
head(p_0.05)
##    gene     transition abs_log2_fc
## 1   A2M  fc_1-15_16-26   2.0837541
## 2   A2M fc_16-26_27-60   2.1870559
## 3  AARD  fc_1-15_16-26   1.7421494
## 4 ABCA3 fc_27-60_61-85   0.8601670
## 5 ABCA5 fc_16-26_27-60   0.6800931
## 6 ABCA5 fc_61-85_86-96   1.8283348

2 Hierarchical clustering

2.1 Select DE targets of DE TFs

# data frame to map to the following transition
transitions <- data.frame(TF_transition = c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85"), 
                          target_transition = c("fc_16-26_27-60", "fc_27-60_61-85", "fc_61-85_86-96"))

# select DE TFs and their DE targets in the following transition
DE_TFs_targets <- filter(p_0.05, gene %in% TF_targets$TF) %>%
  filter(transition != "fc_61-85_86-96") %>%
  dplyr::rename(TF = gene, TF_transition = transition, TF_fc = abs_log2_fc) %>%
  left_join(transitions) %>%
  left_join(TF_targets) %>%
  left_join(dplyr::rename(p_0.05, target = gene, target_transition = transition)) %>%
  drop_na(abs_log2_fc)
## Joining, by = "TF_transition"
## Joining, by = "TF"
## Joining, by = c("target_transition", "target")
head(DE_TFs_targets)
##    TF TF_transition    TF_fc target_transition  target abs_log2_fc
## 1 ERG fc_1-15_16-26 1.899511    fc_16-26_27-60   MOAP1  22.5642379
## 2 ERG fc_1-15_16-26 1.899511    fc_16-26_27-60   WNT2B   1.1286318
## 3 ERG fc_1-15_16-26 1.899511    fc_16-26_27-60    RRN3  23.9218949
## 4 ERG fc_1-15_16-26 1.899511    fc_16-26_27-60  NIPAL3   0.4870438
## 5 ERG fc_1-15_16-26 1.899511    fc_16-26_27-60 COL16A1   0.5443583
## 6 ERG fc_1-15_16-26 1.899511    fc_16-26_27-60  ZNF697   0.5524617
group_by(DE_TFs_targets, TF) %>% summarize(n_DE_targets = n()) %>% 
  arrange(desc(n_DE_targets))
## # A tibble: 13 × 2
##    TF     n_DE_targets
##    <chr>         <int>
##  1 MYH11            99
##  2 ERG              86
##  3 FOS              58
##  4 TFAP2C           55
##  5 MECOM            24
##  6 PGR              20
##  7 RUNX2            15
##  8 MAFK              9
##  9 FOXP2             6
## 10 TCF7L2            3
## 11 MEIS1             2
## 12 HOXA13            1
## 13 NR5A2             1

2.2 Expression of targets

expr_DE_targets <- left_join(DE_TFs_targets, dplyr::rename(mean_counts, target = gene)) %>%
  filter(TF == "FOS") %>%
  subset(select = c(target, age_group, counts)) %>%
  #drop_na() %>%
  spread(key = age_group, value = counts) 
## Joining, by = "target"
head(expr_DE_targets)
##     target      1-15     16-26     27-60     61-85      86-96
## 1    ACTN4 8659.1706 8282.4531 7337.9509 8586.8436 5003.89874
## 2     ANLN 5757.1681 6237.0074 6537.9018 5922.7134  932.15686
## 3     BUB1 1233.4397 1263.9336 1299.8837 1117.4248  220.35056
## 4 C1orf159  145.7017  128.9521  119.3703  139.6103   42.86685
## 5 C21orf58  240.3914  208.6302  205.2445  187.6005   54.64397
## 6  C7orf50  926.5183  825.3168  735.9493  816.6335  263.22009

2.3 Calculate distance and perform clustering

# Create matrix
hclust_matrix <- select(expr_DE_targets, -target) %>%
  as.matrix()
rownames(hclust_matrix) <- expr_DE_targets$target

# Z-transform values for each gene
hclust_matrix %<>% t() %>% scale() %>% t()

# Calculate euclidean distances between genes
gene_dist <- dist(hclust_matrix)

# Clustering
gene_hclust <- hclust(gene_dist, method = "ward.D2")

# Plot dendrogram
plot(gene_hclust, labels = FALSE)

4 Difference map: young - old fibroblasts for FOS DE targets

# Read in difference map for FOS DE targets
diff <- read.delim(paste0(dir, "Data/difference_maps/diff_FOS_DE_targets.csv"), sep = ",") %>%
  gather(key = 'chr2', value = 'difference', -chr1)

# Number of differences
diff_counts <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
  left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
  dplyr::rename('locus' = 'chr1') %>% 
  subset(select = c(locus, n, group)) %>%
  filter(n >= 7)
## Joining, by = "difference"
head(diff_counts)
## # A tibble: 6 × 3
## # Groups:   locus [6]
##   locus                   n group         
##   <chr>               <int> <chr>         
## 1 chr_1_loc_1000000       7 young-specific
## 2 chr_1_loc_1250000       7 young-specific
## 3 chr_11_loc_65000000     7 young-specific
## 4 chr_17_loc_78000000     7 old-specific  
## 5 chr_20_loc_33500000     8 young-specific
## 6 chr_21_loc_46250000     7 young-specific
# Map loci to genes
loci <- unique(diff$chr1)
gene_loci <- read.delim(paste0(dir, "Data/all_gene_loci.csv"), sep = ",") %>%
  filter(gene %in% expr_DE_targets$target) %>%
  left_join(diff_counts) %>% 
  mutate(group = replace_na(group, "no_differences"))
## Joining, by = "locus"
# Add expression values
expr_targets <- left_join(gene_loci, mean_counts) %>%
  subset(select = c(gene, age_group, counts, group)) %>%
  group_by(gene) %>%
  mutate(counts = scale(counts))
## Joining, by = "gene"
ggplot(expr_targets, aes(x = age_group, y = counts, group = gene)) +
  geom_line(alpha = 0.3) +
  facet_wrap(~group, nrow = 1)+
  geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
  theme_bw()

5 Difference map for loci from last PCST network

5.1 Expression patterns for difference groups

# Read in difference map for last PCST loci and Louvain clusters
diff <- read.delim(paste0(dir, "Data/difference_maps/diff_last_PCST_loci.csv"), sep = ",") %>%
  gather(key = 'chr2', value = 'difference', -chr1)
clusters <- read.delim(paste0(dir, 'Data/difference_maps/louvain_clusters_last_PCST.csv'), sep = ",") 
#clusters %<>% mutate(cluster = sample(clusters$cluster, nrow(clusters)))

# Number of differences
diff_counts <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
  left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
  dplyr::rename('locus' = 'chr1') %>% 
  subset(select = c(locus, n, group)) %>%
  filter(n >= 40)
## Joining, by = "difference"
# Add number of differences to the annotation df
gene_loci <- clusters %>%
  left_join(diff_counts) %>% 
  mutate(group = replace_na(group, "no_differences"))
## Joining, by = "locus"
# Add expression values
expr_targets <- left_join(gene_loci, mean_counts) %>%
  subset(select = c(gene, locus, group, cluster, age_group, counts)) %>%
  group_by(gene) %>%
  mutate(counts = scale(counts))
## Joining, by = "gene"
ggplot(expr_targets, aes(x = age_group, y = counts, group = gene)) +
  geom_line(alpha = 0.3) +
  facet_wrap(~group, nrow = 1)+
  geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
  theme_bw()

5.2 Expression patterns per Louvain cluster

ggplot(expr_targets, aes(x = age_group, y = counts, group = gene)) +
  geom_line(alpha = 0.3) +
  facet_wrap(~cluster, nrow = 2)+
  geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
  theme_bw()+
  theme(text = element_text(size = 15))

# Number of differences
diff_counts <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
  left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
  dplyr::rename('locus' = 'chr1') %>% 
  subset(select = c(locus, n, group))
## Joining, by = "difference"
# Get top 10 loci per cluster (with most differences)
selection <- clusters %>%
  left_join(diff_counts) %>% 
  group_by(cluster) %>% 
  slice_max(n, n = 10, with_ties = FALSE)
## Joining, by = "locus"
# Expression patterns only for the top 10 loci per cluster
expr_targets_filt <- left_join(selection, mean_counts) %>%
  subset(select = c(gene, locus, group, cluster, age_group, counts)) %>%
  group_by(gene) %>%
  mutate(counts = scale(counts))
## Joining, by = "gene"
ggplot(expr_targets_filt, aes(x = age_group, y = counts, group = gene)) +
  geom_line(alpha = 0.3) +
  facet_wrap(~cluster)+
  geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
  theme_bw()+
  theme(text = element_text(size = 15))

5.3 Distribution of differences among clusters

clusters_diff <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
  left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
  dplyr::rename('locus' = 'chr1') %>%
  left_join(clusters) %>%
  mutate(group = factor(group, levels = c('young-specific', 'old-specific'))) %>%
  group_by(group, cluster) %>%
  mutate(mean = mean(n))
## Joining, by = "difference"
## Joining, by = "locus"
ggplot(clusters_diff, aes(x = n)) + 
  geom_histogram() +
  facet_grid(rows = vars(group), cols = vars(cluster), scale = "free_x") +
  geom_vline(aes(xintercept = mean, group = cluster), colour = 'red') +
  theme_bw() +
  xlab('number of differences') +
  ylab('number of loci')+
  theme(text = element_text(size = 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.4 Distribution of differences per locus

clusters_diff_wide <- subset(clusters_diff, select = c(locus, gene, cluster, group, n)) %>%
  spread(key = group, value = n, fill = 0) %>%
  mutate(cluster = factor(cluster, levels = seq(0, 5, 1)))
colnames(clusters_diff_wide) <- c('locus', 'gene', 'cluster', 'young_specific', 'old_specific')

ggplot(clusters_diff_wide, aes(x = young_specific, y = old_specific, color = cluster)) +
  geom_point() +
  xlab('young-specific interactions per locus') +
  ylab('old-specific interactions per locus') +
  facet_wrap(~cluster) +
  theme_bw()+
  theme(text = element_text(size = 17)) +
  geom_vline(xintercept = 10) +
  geom_hline(yintercept = 10)

5.5 Loci with many young and old specific interactions

switching_loci <- filter(clusters_diff_wide, young_specific > 10, old_specific > 10)

5.6 Clustering according to expression values

# Create matrix
hclust_matrix <- select(ungroup(expr_targets), -c(group, locus, cluster)) %>%
  spread(key = age_group, value = counts) %>%
  column_to_rownames('gene') %>%
  as.matrix()

# Calculate euclidean distances between genes
gene_dist <- dist(hclust_matrix)

# Clustering
gene_hclust <- hclust(gene_dist, method = "ward.D2")

# Plot dendrogram
plot(gene_hclust, labels = FALSE)

gene_cluster <- cutree(gene_hclust, k = 6) %>% 
  enframe() %>% 
  dplyr::rename(gene = name, cluster = value) %>%
  left_join(group_by(mean_counts, gene) %>% mutate(counts = scale(counts)))
## Joining, by = "gene"
ggplot(gene_cluster, aes(x = age_group, y = counts, group = gene)) +
  geom_line(alpha = 0.3) +
  facet_wrap(~cluster, nrow = 2)+
  geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
  theme_bw()+
  theme(text = element_text(size = 15))

5.7 TF enrichment per Louvain cluster

targets_per_cluster <- left_join(clusters, dplyr::rename(TF_targets, 'gene' = 'target')) %>%
  group_by(TF, cluster) %>%
  summarize(number_of_targets = n()) %>%
  drop_na()
## Joining, by = "gene"
## `summarise()` has grouped output by 'TF'. You can override using the `.groups`
## argument.
head(targets_per_cluster)
## # A tibble: 6 × 3
## # Groups:   TF [3]
##   TF    cluster number_of_targets
##   <chr>   <int>             <int>
## 1 AFF4        0                 1
## 2 AHR         0                 3
## 3 AHR         1                 2
## 4 AHR         2                 2
## 5 AHR         3                 1
## 6 AR          0                15
# for normalization: number of genes per cluster and percentages that each TF targets
genes_per_cluster <- group_by(clusters, cluster) %>% summarize(n = n())
# percentage of targets per TF
percentages <- targets_per_cluster %>%
  group_by(TF) %>%
  summarize(number_of_targets = sum(number_of_targets)) %>%
  mutate(percentage_targets = number_of_targets / length(clusters$gene))

wide <- spread(targets_per_cluster, key = cluster, value = number_of_targets, fill = 0) %>%
  column_to_rownames('TF')
colnames(wide) <- c('cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5')

# enrichment score: percentage of genes targeted in the cluster / percentages of genes targeted overall (in the PCST genes)
wide <- t(t(wide)/genes_per_cluster$n) 
#wide <- wide / percentages$percentage_targets
  
#pheatmap(wide, show_rownames = F, clustering_method = 'ward.D2', 
#         color = colorRampPalette(brewer.pal(n = 7, name = "YlOrRd"))(100))
pheatmap(wide, show_rownames = F, clustering_method = 'ward.D2', scale = "row")

–> There are groups of TFs that only target genes in one cluster specifically. Do these TF groups overlap with the ones we identified in our PCST analysis?

# load info about which TFs occurred in which of the three network
pcst_TFs = read.delim(paste0(dir, 'Data/pcst/incl_TFs_design2.csv'), sep = ",") %>%
  mutate(included = "True") %>%
  mutate(net = factor(net, levels = c('young_net', 'middle_net', 'old_net'))) %>% #keeps ordering
  spread(key = net, value = included, fill = "False") %>%
  right_join(data.frame('TF' = rownames(wide))) %>%
  replace(is.na(.), 'False') %>%
  column_to_rownames('TF')
## Joining, by = "TF"
head(pcst_TFs)
##       young_net middle_net old_net
## AHR        True       True    True
## AR        False       True   False
## ARNT      False      False    True
## ASCL1      True       True   False
## ASCL2     False       True    True
## ATF1       True       True    True
# add annotation bars according to PCST
annoCol<-list(young_net=c(True="blue", False="grey"), 
              middle_net = c(True = 'red', False = 'grey'), 
              old_net = c(True = 'green', False = 'grey'))

pheatmap(wide, show_rownames = F, clustering_method = 'average',
         annotation_row = pcst_TFs, annotation_colors = annoCol, scale = "row")